In [1]:
# import libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
sns.set(context='notebook')
sns.set_style("whitegrid", {'axes.grid' : False})
plt.tight_layout()
import pickle
import warnings
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix 
from sklearn.metrics import precision_recall_curve, auc 
warnings.filterwarnings("ignore")
#import scorecardpy as sc
import plotly.graph_objects as go
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import norm
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from statsmodels.tools import add_constant
import statsmodels.api as sm
In [2]:
#importing the file
df_bs=pd.read_excel(r"C:\Banks\Final_scorecard.xlsx")
df_bs_lgd=df_bs.copy()
In [3]:
#top 5 rows
df_bs.head()
Out[3]:
id time orig_time first_time mat_time res_time balance_time LTV_time interest_rate_time rate_time ... LTV_orig_time Interest_Rate_orig_time state_orig_time hpi_orig_time default_time payoff_time status_time status_check lgd_time recovery_res
0 1 25 -7 25 113 NaN 41303.42 24.498336 9.2 5.44 ... 69.4 9.2 CA 87.03 0 0 0 0 NaN NaN
1 1 26 -7 25 113 NaN 41061.95 24.483867 9.2 5.44 ... 69.4 9.2 CA 87.03 0 0 0 0 NaN NaN
2 1 27 -7 25 113 NaN 40804.42 24.626795 9.2 5.44 ... 69.4 9.2 CA 87.03 0 0 0 0 NaN NaN
3 1 28 -7 25 113 NaN 40483.89 24.735883 9.2 5.44 ... 69.4 9.2 CA 87.03 0 0 0 0 NaN NaN
4 1 29 -7 25 113 NaN 40367.06 24.925476 9.2 5.44 ... 69.4 9.2 CA 87.03 0 0 0 0 NaN NaN

5 rows × 29 columns

In [4]:
#checking null vales in LTV time columns
df_bs['LTV_time'].isna().sum()
Out[4]:
270
In [5]:
#filtering default time
df_bs['status_check']=np.where(df_bs['default_time']==1,1,np.where(df_bs['payoff_time']==1,2,0))
In [6]:
#checking datatype and counts of datasets
df_bs.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622489 entries, 0 to 622488
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       622489 non-null  int64  
 1   time                     622489 non-null  int64  
 2   orig_time                622489 non-null  int64  
 3   first_time               622489 non-null  int64  
 4   mat_time                 622489 non-null  int64  
 5   res_time                 11434 non-null   float64
 6   balance_time             622489 non-null  float64
 7   LTV_time                 622219 non-null  float64
 8   interest_rate_time       622489 non-null  float64
 9   rate_time                622489 non-null  float64
 10  hpi_time                 622489 non-null  float64
 11  gdp_time                 622489 non-null  float64
 12  uer_time                 622489 non-null  float64
 13  REtype_CO_orig_time      622489 non-null  int64  
 14  REtype_PU_orig_time      622489 non-null  int64  
 15  REtype_SF_orig_time      622489 non-null  int64  
 16  investor_orig_time       622489 non-null  int64  
 17  balance_orig_time        622489 non-null  float64
 18  FICO_orig_time           622489 non-null  int64  
 19  LTV_orig_time            622489 non-null  float64
 20  Interest_Rate_orig_time  622489 non-null  float64
 21  state_orig_time          619660 non-null  object 
 22  hpi_orig_time            622489 non-null  float64
 23  default_time             622489 non-null  int64  
 24  payoff_time              622489 non-null  int64  
 25  status_time              622489 non-null  int64  
 26  status_check             622489 non-null  int32  
 27  lgd_time                 15158 non-null   float64
 28  recovery_res             15158 non-null   float64
dtypes: float64(14), int32(1), int64(13), object(1)
memory usage: 135.4+ MB
In [7]:
#filtering time
df_bs['CHECK1']=np.where(df_bs['time']>=df_bs['first_time'],1,0)
In [8]:
#filtering first time
df_bs['CHECK2']=np.where(df_bs['first_time']>=df_bs['orig_time'],1,0)
In [9]:
#filtering origin time
df_bs['CHECK3']=np.where(df_bs['orig_time']<df_bs['mat_time'],1,0)
In [10]:
#filtering balance time
df_bs['CHECK4']=np.where(df_bs['balance_time']<=df_bs['balance_orig_time'],1,0)
In [11]:
#filtering status time
df_bs['CHECK5']=np.where(df_bs['status_time']==df_bs['status_check'],1,0)
In [12]:
#coverting null to -99
df_bs['LTV_time']=df_bs['LTV_time'].replace(np.nan,-99)
In [13]:
#filtering LTV time
df_bs['CHECK6']=np.where(df_bs['LTV_time']==-99,1,0)
In [14]:
#copy datasets 
df_bs1=df_bs.copy()
In [15]:
#identifying ids that needs to be removed
remove_id1=list(df_bs1[((df_bs1.CHECK1 == 0) | (df_bs1.CHECK2==0) | (df_bs1.CHECK3==0) | (df_bs1.CHECK6==1) )]['id'].unique())
In [16]:
#removing ids 
df_bs=df_bs[~df_bs.id.isin(remove_id1)]
In [17]:
#removing extreme ID values
remove_id2=list(df_bs[(df_bs.interest_rate_time == 0) | (df_bs.interest_rate_time  == 37.500)]['id'].unique())
In [18]:
#removing IDs
df_bs=df_bs[~df_bs.id.isin(remove_id2)]
In [19]:
#creating vintage field
df_bs['vintage']=np.where(df_bs['orig_time']<=0,0,np.where(df_bs['orig_time']<=12,1,np.where(df_bs['orig_time']<=24,2,np.where(df_bs['orig_time']<=36,3,np.where(df_bs['orig_time']<=48,4,5)))))
In [20]:
#making pivot of average of gdp_time, hpi_time,interest_rate_time,uer_time columns
average_pivot=pd.pivot_table(df_bs, index=['time'], aggfunc= {'hpi_time': 'mean', 'gdp_time': 'mean','uer_time': 'mean', 'interest_rate_time': 'mean'}).reset_index()
In [21]:
#ploting gdp_time against time
average_pivot1 = pd.melt(average_pivot, id_vars='time', value_vars=average_pivot.columns[:-3])

# plotly express
fig = px.line(average_pivot1, x='time', y='value', color='variable')
fig.show()
In [22]:
#ploting interest_rate_time against time
average_pivot2 = pd.melt(average_pivot, id_vars='time', value_vars=average_pivot.columns[-2:-1])

# plotly express
fig = px.line(average_pivot2, x='time', y='value', color='variable')
fig.show()
In [23]:
#ploting hpi_time against time
average_pivot3 = pd.melt(average_pivot, id_vars='time', value_vars=average_pivot.columns[-3:-2])

# plotly express
fig = px.line(average_pivot3, x='time', y='value', color='variable')
fig.show()
In [24]:
#ploting uer_time against time
average_pivot4 = pd.melt(average_pivot, id_vars='time', value_vars=average_pivot.columns[-1:])

# plotly express
fig = px.line(average_pivot4, x='time', y='value', color='variable')
fig.show()
In [25]:
#adding standard deviation of gdp_time, hpi_time,interest_rate_time,uer_time columns to dataframe
cols = list(average_pivot.columns)
cols.remove('time')


for col in cols:
    col_std = col + '_std'
    average_pivot[col_std] = (average_pivot[col] - average_pivot[col].mean())/average_pivot[col].std()
In [26]:
#finding average value of FICO_orig_time and LTV_orig_time based on vintage
fico_pivot=pd.pivot_table(df_bs, index=['vintage'], aggfunc= {'FICO_orig_time': 'mean', 'LTV_orig_time': 'mean'}).reset_index()
In [27]:
#renaming the column vintage to origin_time 
fico_pivot=fico_pivot.rename(columns={'vintage':'origin_time'})
In [28]:
#obtaining min max value of columns
cols = list(fico_pivot.columns)
cols.remove('origin_time')


for col in cols:
    col_std = col + '_minmax'
    fico_pivot[col_std] = (fico_pivot[col] - fico_pivot[col].min())/(fico_pivot[col].max() - fico_pivot[col].min())
In [29]:
#filtering the colums
fico_pivot_graph=fico_pivot.loc[:,['origin_time','FICO_orig_time_minmax','LTV_orig_time_minmax']]
In [30]:
#ploting line chart between FICO_orig_time_minmax vs LTV_orig_time_minmax
fico_pivot_graph = pd.melt(fico_pivot_graph, id_vars='origin_time', value_vars=fico_pivot_graph)

# plotly express
fig = px.line(fico_pivot_graph, x='origin_time', y='value', color='variable')
fig.show()
In [31]:
#creating mob column
df_bs['mob']=np.where(round(df_bs['time']-df_bs['orig_time'],0)>40,40,(df_bs['time']-df_bs['orig_time']))
In [32]:
#creating ID index column
df_bs['Id_index']=df_bs['id'].astype(str) +'-'+df_bs['time'].astype(str)
In [33]:
#checking average default based month on books
mob_pivot=pd.pivot_table(df_bs, index=['mob'], aggfunc= {'default_time': 'mean'}).reset_index()
In [34]:
#ploting graph default with mob 
mob_pivot_graph = pd.melt(mob_pivot, id_vars='mob', value_vars=mob_pivot)

# plotly express
fig = px.line(mob_pivot_graph , x='mob', y='value', color='variable')
fig.show()
In [35]:
#creating average of default_time based on time
period_pivot=pd.pivot_table(df_bs, index=['time'], aggfunc= {'default_time': 'mean'}).reset_index()
In [36]:
#renaming column time with period
period_pivot=period_pivot.rename(columns={'time':'period'})
In [37]:
#plotting default_time based on period
period_pivot_graph = pd.melt(period_pivot, id_vars='period', value_vars=period_pivot)

# plotly express
fig = px.line(period_pivot_graph , x='period', y='value', color='variable')
fig.show()
In [38]:
#finding average of default_time vs vintage
vintage_pivot=pd.pivot_table(df_bs, index=['vintage'], aggfunc= {'default_time': 'mean'}).reset_index()
In [39]:
#ploting default_time based on vintage
vintage_pivot_graph = pd.melt(vintage_pivot, id_vars='vintage', value_vars=vintage_pivot)

# plotly express
fig = px.bar(vintage_pivot_graph , x='vintage', y='value', color='variable')
fig.show()
In [40]:
#deriving equity_time
df_bs['equity_time']=1-(df_bs['LTV_time']/100)
In [41]:
#deriving interest_rate_time
df_bs['interest_rate_time_new']=df_bs['interest_rate_time']/(12*100)
In [42]:
#deriving age_time
df_bs['age_time']=df_bs['mat_time']-df_bs['orig_time']
In [43]:
#deriving annuity
df_bs['annuity']=(df_bs['balance_time']*df_bs['interest_rate_time_new'])/(1-(1+df_bs['interest_rate_time_new'])**(-df_bs['age_time']))
In [44]:
#deriving n2
df_bs['n2']=df_bs['time']-df_bs['orig_time']
In [45]:
#deriving FV_balance
df_bs['FV_balance']=df_bs['balance_orig_time']*(1+df_bs['interest_rate_time_new'])**df_bs['n2']
In [46]:
#deriving FV_annuity
df_bs['FV_annuity']=df_bs['annuity']*((1 + df_bs['interest_rate_time_new'])**(df_bs['n2'])-1)/df_bs['interest_rate_time_new']
In [47]:
#deriving balance_scheduled_time
df_bs['balance_scheduled_time']=df_bs['FV_balance']-df_bs['FV_annuity']
In [48]:
#deriving property_orig_time
df_bs['property_orig_time']=np.where(df_bs['balance_orig_time']/(df_bs['LTV_orig_time']/100)<100000,100000,df_bs['balance_orig_time']/(df_bs['LTV_orig_time']/100))
In [49]:
#deriving cep_time
df_bs['cep_time']=(df_bs['balance_scheduled_time']-df_bs['balance_time'])/df_bs['property_orig_time']
In [50]:
#finding averge of cep_time based on time
cep_time_pivot=pd.pivot_table(df_bs, index=['time'], aggfunc= {'cep_time': 'mean'}).reset_index()
In [51]:
#finding averge of cep_time based on time
cep_time_pivot_graph = pd.melt(cep_time_pivot, id_vars='time', value_vars=cep_time_pivot)

# plotly express
fig = px.line(cep_time_pivot_graph , x='time', y='value', color='variable')
fig.show()
In [52]:
#creating shapshot datasets
shapshot_pivot=pd.pivot_table(df_bs, index=['id'], aggfunc= {'mat_time': 'count', 'orig_time': 'mean','first_time': 'mean', 'time': 'max', 'status_time': 'sum'}).reset_index()
In [53]:
#renaming mat_time to no_of_obs
shapshot_pivot=shapshot_pivot.rename(columns={'mat_time':'no_of_obs'})
In [54]:
#creating filter based on time and first_time
shapshot_pivot['check']=np.where(shapshot_pivot['time']-shapshot_pivot['first_time']+1==shapshot_pivot['no_of_obs'],1,0)
In [55]:
#including check value equal to 1
shapshot_pivot=shapshot_pivot[shapshot_pivot['check']==1].reset_index()
In [56]:
#dropping check and index columns
shapshot_pivot=shapshot_pivot.drop(['check','index'], axis=1)
In [57]:
#renaming  column time to last_status_time
shapshot_pivot=shapshot_pivot.rename(columns={'time':'last_status_time'})
In [58]:
#creating status_snapshot1 column filtering last_status_time as 1
shapshot_pivot['status_snapshot1']=np.where(1<shapshot_pivot['first_time'],"",np.where(1>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [59]:
#creating status_perf_end1 column filtering last_status_time as 13
shapshot_pivot['status_perf_end1']=np.where(13<shapshot_pivot['first_time'],"",np.where(13>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [60]:
shapshot_pivot['status_snapshot2']=np.where(5<shapshot_pivot['first_time'],"",np.where(5>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [61]:
shapshot_pivot['status_perf_end2']=np.where(17<shapshot_pivot['first_time'],"",np.where(17>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [62]:
shapshot_pivot['status_snapshot3']=np.where(9<shapshot_pivot['first_time'],"",np.where(9>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [63]:
shapshot_pivot['status_perf_end3']=np.where(21<shapshot_pivot['first_time'],"",np.where(21>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [64]:
shapshot_pivot['status_snapshot4']=np.where(13<shapshot_pivot['first_time'],"",np.where(13>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [65]:
shapshot_pivot['status_perf_end4']=np.where(25<shapshot_pivot['first_time'],"",np.where(25>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [66]:
shapshot_pivot['status_snapshot5']=np.where(17<shapshot_pivot['first_time'],"",np.where(17>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [67]:
shapshot_pivot['status_perf_end5']=np.where(29<shapshot_pivot['first_time'],"",np.where(29>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [68]:
shapshot_pivot['status_snapshot6']=np.where(21<shapshot_pivot['first_time'],"",np.where(21>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [69]:
shapshot_pivot['status_perf_end6']=np.where(33<shapshot_pivot['first_time'],"",np.where(33>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [70]:
shapshot_pivot['status_snapshot7']=np.where(25<shapshot_pivot['first_time'],"",np.where(25>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [71]:
shapshot_pivot['status_perf_end7']=np.where(37<shapshot_pivot['first_time'],"",np.where(37>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [72]:
shapshot_pivot['status_snapshot8']=np.where(29<shapshot_pivot['first_time'],"",np.where(29>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [73]:
shapshot_pivot['status_perf_end8']=np.where(41<shapshot_pivot['first_time'],"",np.where(41>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [74]:
shapshot_pivot['status_snapshot9']=np.where(33<shapshot_pivot['first_time'],"",np.where(33>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [75]:
shapshot_pivot['status_perf_end9']=np.where(45<shapshot_pivot['first_time'],"",np.where(45>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [76]:
shapshot_pivot['status_snapshot10']=np.where(37<shapshot_pivot['first_time'],"",np.where(37>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [77]:
shapshot_pivot['status_perf_end10']=np.where(49<shapshot_pivot['first_time'],"",np.where(49>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [78]:
shapshot_pivot['status_snapshot11']=np.where(41<shapshot_pivot['first_time'],"",np.where(41>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [79]:
shapshot_pivot['status_perf_end11']=np.where(53<shapshot_pivot['first_time'],"",np.where(53>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [80]:
shapshot_pivot['status_snapshot12']=np.where(45<shapshot_pivot['first_time'],"",np.where(45>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [81]:
shapshot_pivot['status_perf_end12']=np.where(57<shapshot_pivot['first_time'],"",np.where(57>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [82]:
shapshot_pivot['status_snapshot13']=np.where(48<shapshot_pivot['first_time'],"",np.where(48>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [83]:
shapshot_pivot['status_perf_end13']=np.where(60<shapshot_pivot['first_time'],"",np.where(60>=shapshot_pivot['last_status_time'],shapshot_pivot['status_time'],0))
In [84]:
#filtering columns
shapshot_base=shapshot_pivot.loc[:,['id', 'first_time', 'no_of_obs', 'orig_time', 'status_time',
       'last_status_time']]
In [85]:
#appending 13 times shapshot data for 13 windows
shapshot_df=pd.concat([shapshot_base,shapshot_base,shapshot_base,shapshot_base,shapshot_base,shapshot_base,shapshot_base,shapshot_base,shapshot_base,shapshot_base,shapshot_base,shapshot_base,shapshot_base]).reset_index(drop=True)
In [86]:
#creating 13 datasets based on windows
shapshot1=shapshot_pivot[['status_snapshot1', 'status_perf_end1']]
shapshot1['window']=1
shapshot1['snapshot']=1
shapshot1['perf_end']=13
shapshot2=shapshot_pivot[['status_snapshot2', 'status_perf_end2']]
shapshot2['window']=2
shapshot2['snapshot']=5

shapshot2['perf_end']=17
shapshot3=shapshot_pivot[['status_snapshot3', 'status_perf_end3']]
shapshot3['window']=3
shapshot3['snapshot']=9
shapshot3['perf_end']=21
shapshot4=shapshot_pivot[['status_snapshot4', 'status_perf_end4']]
shapshot4['window']=4
shapshot4['snapshot']=13
shapshot4['perf_end']=25
shapshot5=shapshot_pivot[['status_snapshot5', 'status_perf_end5']]
shapshot5['window']=5
shapshot5['snapshot']=17
shapshot5['perf_end']=29
shapshot6=shapshot_pivot[['status_snapshot6', 'status_perf_end6']]
shapshot6['window']=6
shapshot6['snapshot']=21
shapshot6['perf_end']=33
shapshot7=shapshot_pivot[['status_snapshot7', 'status_perf_end7']]
shapshot7['window']=7
shapshot7['snapshot']=25
shapshot7['perf_end']=37
shapshot8=shapshot_pivot[['status_snapshot8', 'status_perf_end8']]
shapshot8['window']=8
shapshot8['snapshot']=29
shapshot8['perf_end']=41
shapshot9=shapshot_pivot[['status_snapshot9', 'status_perf_end9']]
shapshot9['window']=9
shapshot9['snapshot']=33
shapshot9['perf_end']=45
shapshot10=shapshot_pivot[['status_snapshot10', 'status_perf_end10']]
shapshot10['window']=10
shapshot10['snapshot']=37
shapshot10['perf_end']=49
shapshot11=shapshot_pivot[['status_snapshot11', 'status_perf_end11']]
shapshot11['window']=11
shapshot11['snapshot']=41
shapshot11['perf_end']=53
shapshot12=shapshot_pivot[['status_snapshot12', 'status_perf_end12']]
shapshot12['window']=12
shapshot12['snapshot']=45
shapshot12['perf_end']=57
shapshot13=shapshot_pivot[['status_snapshot13', 'status_perf_end13']]
shapshot13['window']=13
shapshot13['snapshot']=49
shapshot13['perf_end']=60
In [87]:
#appending all datasets
all_dfs = [shapshot1,
shapshot2,
shapshot3,
shapshot4,
shapshot5,
shapshot6,
shapshot7,
shapshot8,
shapshot9,
shapshot10,
shapshot11,
shapshot12,
shapshot13]

for df in all_dfs:
    df.columns = ['status_snapshot', 'status_perf_end','window','snapshot','perf_end']

status_df=pd.concat(all_dfs).reset_index(drop=True)
In [88]:
#merging both datasets to get shapshot status
snapshot_status_df=pd.concat([shapshot_df,status_df], axis=1)
In [89]:
# converting status_snapshot column to interger
snapshot_status_df['status_snapshot'] = pd.to_numeric(snapshot_status_df['status_snapshot'])
In [90]:
# converting status_perf_end column to interger
snapshot_status_df['status_perf_end'] = pd.to_numeric(snapshot_status_df['status_perf_end'])
In [91]:
#creating keep1 filter 
snapshot_status_df['keep1']=np.where((snapshot_status_df['status_snapshot']==0)&(snapshot_status_df['status_perf_end'].isin([0,1,2])),"Yes","No")
In [92]:
#creating keep2 filter 
snapshot_status_df['keep2']=np.where(snapshot_status_df['snapshot']>snapshot_status_df['last_status_time'],"No","Yes")
In [93]:
#filtering snapshot status datasets based keep1 and keep2
snapshot_status_df=snapshot_status_df[(snapshot_status_df['keep1']=='Yes') & (snapshot_status_df['keep2']=='Yes')].reset_index()
In [94]:
#deleting index column from dataframe
del snapshot_status_df['index']
In [95]:
#creaing mob column on snapshot status dataframe
snapshot_status_df['mob']=round(snapshot_status_df['snapshot']-snapshot_status_df['orig_time'],0).astype(int)
In [96]:
#creaing vintage column on snapshot status dataframe
snapshot_status_df['vintage']=np.where(snapshot_status_df['orig_time']<=0,0,np.where(snapshot_status_df['orig_time']<=12,1,np.where(snapshot_status_df['orig_time']<=24,2,np.where(snapshot_status_df['orig_time']<=36,3,np.where(snapshot_status_df['orig_time']<=48,4,5)))))
In [97]:
#creaing Seasoned column on snapshot status dataframe
snapshot_status_df['Seasoned']=np.where(snapshot_status_df['mob']>=6,"Yes","No")
In [98]:
#filtering snapshot status dataframe with Seasoned equal to Yes
snapshot_status_df=snapshot_status_df[snapshot_status_df['Seasoned']=='Yes']
In [99]:
#snapshot_status_df=snapshot_status_df.reset_index(drop=True)
In [100]:
#creating ID index column
snapshot_status_df['Id_index']=snapshot_status_df['id'].astype(str) +'-'+snapshot_status_df['snapshot'].astype(str)
In [101]:
#merging actual dataset with shapshot datasets
merged_data=pd.merge(snapshot_status_df, df_bs[['Id_index','LTV_time', 'interest_rate_time','hpi_time','gdp_time','uer_time','FICO_orig_time','LTV_orig_time','hpi_orig_time','equity_time','cep_time']], on = 'Id_index', how = 'left')
In [102]:
#creating partitioning based on window
merged_data['partitioning']=np.where(merged_data['window']<12,"in_sample","out_sample")
In [103]:
merged_data['status_perf_end']=np.where(merged_data['status_perf_end']==2,0,merged_data['status_perf_end'])
In [104]:
#creating out of sample datasets
out_sample_df=merged_data[merged_data['partitioning']=='out_sample']
In [105]:
#creating in sample datasets
in_sample_df=merged_data[merged_data['partitioning']=='in_sample']
In [106]:
#filtering columns 
in_sample_df=in_sample_df.loc[:,['status_perf_end', 
         'mob', 'vintage',
       'LTV_time', 'interest_rate_time', 'hpi_time', 'gdp_time',
       'uer_time', 'FICO_orig_time', 'LTV_orig_time', 'hpi_orig_time',
       'equity_time', 'cep_time']]
In [107]:
#create one hot encoding for vintage column
in_sample_vintage=pd.get_dummies(in_sample_df.vintage, prefix='vintage')
In [108]:
#adding one hot encoding for vintage columns to in sample datasets
in_sample_df=pd.concat([in_sample_df,in_sample_vintage],axis=1)
In [109]:
#droping vintage columns
in_sample_df=in_sample_df.drop(['vintage'], axis=1)
In [110]:
#creating mob^2 columns
in_sample_df['mob2'] = in_sample_df['mob']*in_sample_df['mob']
In [111]:
#creating independent features
features=in_sample_df.drop(['status_perf_end'], axis=1)
In [112]:
#create train test split
X_train, X_test, Y_train, Y_test  = train_test_split(features,in_sample_df['status_perf_end'], test_size=0.3, random_state=42)
In [ ]:
 
In [113]:
#Implementing Logistic Regression
X_train = add_constant(X_train)
X_test = add_constant(X_test)
logit_model=sm.Logit(Y_train,X_train)
result=logit_model.fit()
print(result.summary())
Optimization terminated successfully.
         Current function value: 0.490285
         Iterations 15
                           Logit Regression Results                           
==============================================================================
Dep. Variable:        status_perf_end   No. Observations:                47831
Model:                          Logit   Df Residuals:                    47816
Method:                           MLE   Df Model:                           14
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                  0.1051
Time:                        06:53:08   Log-Likelihood:                -23451.
converged:                       True   LL-Null:                       -26206.
Covariance Type:            nonrobust   LLR p-value:                     0.000
======================================================================================
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -1.2365   1.17e+06  -1.06e-06      1.000   -2.29e+06    2.29e+06
mob                   -0.0191      0.007     -2.682      0.007      -0.033      -0.005
LTV_time              -0.0123   1417.569  -8.67e-06      1.000   -2778.397    2778.372
interest_rate_time     0.1154      0.008     14.209      0.000       0.099       0.131
hpi_time              -0.0030      0.001     -2.221      0.026      -0.006      -0.000
gdp_time              -0.0280      0.006     -4.873      0.000      -0.039      -0.017
uer_time              -0.1267      0.011    -11.434      0.000      -0.148      -0.105
FICO_orig_time        -0.0044      0.000    -23.500      0.000      -0.005      -0.004
LTV_orig_time          0.0109      0.004      3.071      0.002       0.004       0.018
hpi_orig_time          0.0211      0.002     13.660      0.000       0.018       0.024
equity_time           -1.2364   1.39e+05  -8.86e-06      1.000   -2.73e+05    2.73e+05
cep_time              -8.2338      0.476    -17.315      0.000      -9.166      -7.302
vintage_0             -0.0685   1.63e+06  -4.21e-08      1.000   -3.19e+06    3.19e+06
vintage_1             -0.3110   1.63e+06  -1.91e-07      1.000   -3.19e+06    3.19e+06
vintage_2             -0.5454   1.63e+06  -3.36e-07      1.000   -3.19e+06    3.19e+06
vintage_3             -0.3115   1.63e+06  -1.92e-07      1.000   -3.19e+06    3.19e+06
mob2                   0.0006      0.000      4.675      0.000       0.000       0.001
======================================================================================
In [114]:
#Make prediction for the test data
y_pred=result.predict(X_test)

result.params
Out[114]:
const                -1.236509
mob                  -0.019074
LTV_time             -0.012296
interest_rate_time    0.115389
hpi_time             -0.003036
gdp_time             -0.028007
uer_time             -0.126678
FICO_orig_time       -0.004416
LTV_orig_time         0.010892
hpi_orig_time         0.021056
equity_time          -1.236382
cep_time             -8.233776
vintage_0            -0.068478
vintage_1            -0.311044
vintage_2            -0.545446
vintage_3            -0.311538
mob2                  0.000629
dtype: float64
In [115]:
#ROC for test data
y_pred_test = result.predict(X_test)
fpr, tpr, _ = metrics.roc_curve(Y_test,  y_pred_test)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [116]:
#auc value for test data
auc_test = metrics.roc_auc_score(Y_test,  y_pred_test)
In [117]:
#ROC for train data
y_pred_train = result.predict(X_train)
fpr, tpr, _ = metrics.roc_curve(Y_train,  y_pred_train)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [118]:
#auc value for test data
auc_train = metrics.roc_auc_score(Y_train,  y_pred_train)
In [119]:
#finding optimal value
factor=20/np.log(2)
offset=600-factor*np.log(50)
X_train['pred']=y_pred_train
fpr, tpr, thresholds = metrics.roc_curve(Y_train,y_pred_train)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
In [120]:
optimal_threshold
Out[120]:
0.24795389305811505
In [121]:
auc_test
Out[121]:
0.7333429376387268
In [122]:
auc_train
Out[122]:
0.731863825480699
In [123]:
#Creating confusion matrix for Test dataset
fig, ax = plt.subplots(figsize=(8,6), dpi=100)
conf_matrix = confusion_matrix(Y_train,(y_pred_train > 0.2405))
display = ConfusionMatrixDisplay(conf_matrix)
ax.set(title='Confusion Matrix for Train data')
display.plot(ax=ax);
In [124]:
#Creating classification_repor for Test dataset
print(classification_report(Y_train,(y_pred_train > 0.2405)))
              precision    recall  f1-score   support

         0.0       0.88      0.64      0.74     36483
         1.0       0.38      0.72      0.50     11348

    accuracy                           0.66     47831
   macro avg       0.63      0.68      0.62     47831
weighted avg       0.76      0.66      0.68     47831

In [125]:
#Creating confusion matrix for Train dataset
fig, ax = plt.subplots(figsize=(8,6), dpi=100)
conf_matrix = confusion_matrix(Y_test,(y_pred_test > 0.2405))
display = ConfusionMatrixDisplay(conf_matrix)
ax.set(title='Confusion Matrix for Train data')
display.plot(ax=ax);
In [126]:
#Creating classification_repor for Test dataset
print(classification_report(Y_test,(y_pred_test > 0.2405)))
              precision    recall  f1-score   support

         0.0       0.88      0.64      0.74     15673
         1.0       0.38      0.72      0.50      4827

    accuracy                           0.66     20500
   macro avg       0.63      0.68      0.62     20500
weighted avg       0.76      0.66      0.68     20500

In [127]:
#filtering the columns
out_sample_df=out_sample_df.loc[:,['status_perf_end', 
         'mob', 'vintage',
       'LTV_time', 'interest_rate_time', 'hpi_time', 'gdp_time',
       'uer_time', 'FICO_orig_time', 'LTV_orig_time', 'hpi_orig_time',
       'equity_time', 'cep_time']]
In [128]:
#creating one hot encoding for vintage
out_sample_vintage=pd.get_dummies(out_sample_df.vintage, prefix='vintage')
out_sample_df=out_sample_df.drop(['vintage'], axis=1)
In [129]:
#adding vintage one hot encoding to out of sample data
out_sample_df=pd.concat([out_sample_df,out_sample_vintage],axis=1)
In [130]:
#creating mob^2 column
out_sample_df['mob2'] = out_sample_df['mob']*out_sample_df['mob']
In [131]:
#creating target variable
Y_valid=out_sample_df['status_perf_end']
In [132]:
#creating validation features 
X_valid=out_sample_df.drop(['status_perf_end','vintage_4'], axis=1)
In [133]:
X_valid.head()
Out[133]:
mob LTV_time interest_rate_time hpi_time gdp_time uer_time FICO_orig_time LTV_orig_time hpi_orig_time equity_time cep_time vintage_0 vintage_1 vintage_2 vintage_3 mob2
68331 52 26.100976 9.20 154.87 1.639155 9.1 715 69.4 87.03 0.738990 0.123538 1 0 0 0 2704
68332 24 114.628930 7.53 154.87 1.639155 9.1 609 90.0 208.86 -0.146289 -0.064982 0 0 1 0 576
68333 23 68.140143 7.56 154.87 1.639155 9.1 698 51.6 216.77 0.318599 -0.032762 0 0 1 0 529
68334 22 72.526420 5.99 154.87 1.639155 9.1 654 55.1 221.91 0.274736 -0.021638 0 0 1 0 484
68335 22 102.553080 8.30 154.87 1.639155 9.1 574 75.0 221.91 -0.025531 -0.051175 0 0 1 0 484
In [ ]:

In [134]:
#Make prediction for the validation data
X_valid = add_constant(X_valid)
y_pred_valid=result.predict(X_valid)
In [135]:
#ROC for validation data
fpr, tpr, _ = metrics.roc_curve(Y_valid,  y_pred_valid)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [136]:
#AUC value for validation
auc_valid = metrics.roc_auc_score(Y_valid,  y_pred_valid)
In [137]:
auc_valid
Out[137]:
0.6660168333832398
In [138]:
#Creating classification_report for validation dataset
print(classification_report(Y_valid, ( y_pred_valid > 0.2405)))
              precision    recall  f1-score   support

         0.0       0.92      0.49      0.64     15897
         1.0       0.20      0.74      0.31      2727

    accuracy                           0.52     18624
   macro avg       0.56      0.61      0.47     18624
weighted avg       0.81      0.52      0.59     18624

In [139]:
#Creating confusion matrix for validation dataset
fig, ax = plt.subplots(figsize=(8,6), dpi=100)
conf_matrix = confusion_matrix(Y_valid, ( y_pred_valid > 0.2405))
display = ConfusionMatrixDisplay(conf_matrix)
ax.set(title='Confusion Matrix for Train data')
display.plot(ax=ax);
In [140]:
#Generating score by factor and offset 
X_train['score']=factor*np.log((1-X_train['pred'])/X_train['pred'])+offset
X_test['pred']=y_pred_test
X_valid['pred']=y_pred_valid
X_test['score']=factor*np.log((1-X_test['pred'])/X_test['pred'])+offset
X_valid['score']=factor*np.log((1-X_valid['pred'])/X_valid['pred'])+offset
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [141]:
#taking subset of columns for hybrid models
X_train_hybrid=X_train[['const', 'mob', 'LTV_time', 'interest_rate_time',
        'FICO_orig_time',
       'cep_time', 'vintage_0', 'vintage_1',
       'vintage_2']]
In [142]:
X_test_hybrid=X_test[['const', 'mob', 'LTV_time', 'interest_rate_time',
        'FICO_orig_time',
       'cep_time', 'vintage_0', 'vintage_1',
       'vintage_2']]
In [143]:
X_valid_hybrid=X_valid[['const', 'mob', 'LTV_time', 'interest_rate_time',
        'FICO_orig_time',
       'cep_time', 'vintage_0', 'vintage_1',
       'vintage_2']]
In [144]:
#Implementing Logistic Regression
logit_model1=sm.Logit(Y_train,X_train_hybrid)
result1=logit_model1.fit()
print(result1.summary())
Optimization terminated successfully.
         Current function value: 0.497924
         Iterations 7
                           Logit Regression Results                           
==============================================================================
Dep. Variable:        status_perf_end   No. Observations:                47831
Model:                          Logit   Df Residuals:                    47822
Method:                           MLE   Df Model:                            8
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.09120
Time:                        06:53:12   Log-Likelihood:                -23816.
converged:                       True   LL-Null:                       -26206.
Covariance Type:            nonrobust   LLR p-value:                     0.000
======================================================================================
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -0.1476      0.170     -0.870      0.384      -0.480       0.185
mob                   -0.0416      0.003    -16.523      0.000      -0.047      -0.037
LTV_time               0.0142      0.001     20.442      0.000       0.013       0.016
interest_rate_time     0.1230      0.008     15.698      0.000       0.108       0.138
FICO_orig_time        -0.0040      0.000    -21.601      0.000      -0.004      -0.004
cep_time              -6.2148      0.366    -16.985      0.000      -6.932      -5.498
vintage_0             -0.4188      0.105     -3.999      0.000      -0.624      -0.214
vintage_1             -0.8207      0.068    -12.158      0.000      -0.953      -0.688
vintage_2             -0.2312      0.030     -7.822      0.000      -0.289      -0.173
======================================================================================
In [145]:
#Make prediction for the test data
y_test_hybrid_pred=result1.predict(X_test_hybrid)

result1.params
Out[145]:
const                -0.147553
mob                  -0.041605
LTV_time              0.014178
interest_rate_time    0.122985
FICO_orig_time       -0.003999
cep_time             -6.214779
vintage_0            -0.418835
vintage_1            -0.820746
vintage_2            -0.231168
dtype: float64
In [146]:
#ROC curve for test
y_test_hybrid_pred = result1.predict(X_test_hybrid)
fpr, tpr, _ = metrics.roc_curve(Y_test, y_test_hybrid_pred)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [147]:
#ROC curve for train
y_train_hybrid_pred = result1.predict(X_train_hybrid)
fpr, tpr, _ = metrics.roc_curve(Y_train, y_train_hybrid_pred)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [148]:
#ROC curve for validation
y_valid_hybrid_pred = result1.predict(X_valid_hybrid)
fpr, tpr, _ = metrics.roc_curve(Y_valid, y_valid_hybrid_pred)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [149]:
#finding optimal threshold
fpr, tpr, thresholds = metrics.roc_curve(Y_train,y_train_hybrid_pred)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
In [150]:
optimal_threshold
Out[150]:
0.22349252790154836
In [151]:
#AUC for train, test  and validation
auc_test_hybrid = metrics.roc_auc_score(Y_test, y_test_hybrid_pred )
auc_train_hybrid = metrics.roc_auc_score(Y_train, y_train_hybrid_pred )
auc_valid_hybrid = metrics.roc_auc_score(Y_valid, y_valid_hybrid_pred )
In [ ]:
 
In [ ]:
 
In [152]:
#Creating confusion matrix
fig, ax = plt.subplots(figsize=(8,6), dpi=100)
conf_matrix = confusion_matrix(Y_train,(y_train_hybrid_pred  > 0.2273))
display = ConfusionMatrixDisplay(conf_matrix)
ax.set(title='Confusion Matrix for Train data')
display.plot(ax=ax);
In [153]:
#Creating classification report
print(classification_report(Y_train,(y_train_hybrid_pred  > 0.2273)))
              precision    recall  f1-score   support

         0.0       0.88      0.61      0.72     36483
         1.0       0.37      0.73      0.49     11348

    accuracy                           0.64     47831
   macro avg       0.62      0.67      0.60     47831
weighted avg       0.76      0.64      0.66     47831

In [154]:
print(classification_report(Y_test,(y_test_hybrid_pred > 0.2273)))
              precision    recall  f1-score   support

         0.0       0.88      0.60      0.72     15673
         1.0       0.36      0.73      0.49      4827

    accuracy                           0.64     20500
   macro avg       0.62      0.67      0.60     20500
weighted avg       0.76      0.64      0.66     20500

In [155]:
#Creating confusion matrix
fig, ax = plt.subplots(figsize=(8,6), dpi=100)
conf_matrix = confusion_matrix(Y_test,(y_test_hybrid_pred > 0.2273))
display = ConfusionMatrixDisplay(conf_matrix)
ax.set(title='Confusion Matrix for Test data')
display.plot(ax=ax);
In [156]:
print(classification_report(Y_valid,(y_valid_hybrid_pred  > 0.2288)))
              precision    recall  f1-score   support

         0.0       0.91      0.62      0.74     15897
         1.0       0.23      0.64      0.33      2727

    accuracy                           0.62     18624
   macro avg       0.57      0.63      0.54     18624
weighted avg       0.81      0.62      0.68     18624

In [157]:
#Creating confusion matrix
fig, ax = plt.subplots(figsize=(8,6), dpi=100)
conf_matrix = confusion_matrix(Y_valid,(y_valid_hybrid_pred  > 0.2273))
display = ConfusionMatrixDisplay(conf_matrix)
ax.set(title='Confusion Matrix for Validation data')
display.plot(ax=ax);
In [158]:
X_train_hybrid['pred']=y_train_hybrid_pred
X_test_hybrid['pred']=y_test_hybrid_pred
X_valid_hybrid['pred']=y_valid_hybrid_pred
In [159]:
#getting score for hybrid model
X_train_hybrid['score']=factor*np.log((1-X_train_hybrid['pred'])/X_train_hybrid['pred'])+offset
X_test_hybrid['score']=factor*np.log((1-X_test_hybrid['pred'])/X_test_hybrid['pred'])+offset
X_valid_hybrid['score']=factor*np.log((1-X_valid_hybrid['pred'])/X_valid_hybrid['pred'])+offset
In [160]:
#appending all feature
bs_df=X_train.append(X_test, ignore_index = True).append(X_valid, ignore_index = True)
In [161]:
#dropping earlier predicted value and score
bs_df=bs_df.drop(['pred','score'],axis=1)
In [162]:
#Getting prediction value
bs_df['pred']=result.predict(bs_df)
In [163]:
#getting score for total data
bs_df['score']=factor*np.log((1-bs_df['pred'])/bs_df['pred'])+offset
bs_df['score'] = np.where(bs_df['score'] < 300,300,bs_df['score'])
bs_df['score'] = np.where(bs_df['score'] >800,800,bs_df['score']) 
In [164]:
#creating grade from score 
bs_df['grade']=np.where(bs_df['score']<500,1,np.where(bs_df['score']>590,8,np.where(bs_df['score']<515.16,2,np.where(bs_df['score']<521.24,3,np.where(bs_df['score']<527.47,4,np.where(bs_df['score']<534.20,5,np.where(bs_df['score']<543.51,6,7)))))))

 
In [165]:
#creating appending all target variable
Y=Y_train.append(Y_test, ignore_index = True).append(Y_valid, ignore_index = True)
In [166]:
#merging to make complete dataset
bs_df=pd.concat([bs_df,Y],axis=1)
In [167]:
#grouping grade to get average of predition value and default value
pd_df=bs_df.groupby('grade')['pred','status_perf_end'].mean().reset_index()
In [168]:
#changing column names
pd_df.columns=['grade','ttc_pd','odr']
In [169]:
#ploting ttc_pd and odr based on grade
df_long=pd.melt(pd_df, id_vars=['grade'], value_vars=['ttc_pd','odr'])
fig = px.line(df_long, x='grade', y='value', color='variable',title='Calibration curve')
fig.show()
In [170]:
#for lgd,filtering columns id,time',lgd_time 
df_bs_lgd=df_bs_lgd[['id','time','lgd_time']]
In [171]:
#creting ID index column
df_bs_lgd['Id_index']=df_bs_lgd['id'].astype(str) +'-'+df_bs_lgd['time'].astype(str)
In [172]:
#taking subsets of columns
merged_data_lgd=merged_data.loc[:,['Id_index','snapshot','status_snapshot','status_perf_end']]
In [173]:
#merging datasets based id_idex
df_bs_lgd=pd.merge(df_bs_lgd, merged_data_lgd, on = 'Id_index')
In [174]:
#filtering datasets with status_perf_end equal to 1
df_bs_lgd=df_bs_lgd[df_bs_lgd.status_perf_end==1]
In [175]:
#filling  NA with missing
df_bs_lgd.lgd_time = df_bs_lgd.lgd_time.fillna('Missing')
In [176]:
#exclude rows that has value  Missing
df_bs_lgd=df_bs_lgd[df_bs_lgd.lgd_time!='Missing']
In [177]:
#converting lgd_time as float datatype
df_bs_lgd['lgd_time']=df_bs_lgd['lgd_time'].astype(float)
In [178]:
#create lgd_winsorized column 
df_bs_lgd['lgd_winsorized']=np.where(df_bs_lgd.lgd_time>1,1,df_bs_lgd.lgd_time)
In [179]:
df_bs_lgd
Out[179]:
id time lgd_time Id_index snapshot status_snapshot status_perf_end lgd_winsorized
257 262 49 0.553457 262-49 49 0.0 1.0 0.553457
1129 939 49 1.065037 939-49 49 0.0 1.0 1.000000
1158 954 49 0.286131 954-49 49 0.0 1.0 0.286131
2006 1756 49 0.808159 1756-49 49 0.0 1.0 0.808159
2556 2137 49 0.121957 2137-49 49 0.0 1.0 0.121957
... ... ... ... ... ... ... ... ...
86049 48740 49 0.669826 48740-49 49 0.0 1.0 0.669826
86213 48839 49 0.000000 48839-49 49 0.0 1.0 0.000000
86323 48887 49 0.408646 48887-49 49 0.0 1.0 0.408646
86823 49117 49 1.181174 49117-49 49 0.0 1.0 1.000000
86874 49141 49 0.398785 49141-49 49 0.0 1.0 0.398785

188 rows × 8 columns

In [180]:
#finding lgd winsorized average value
df_bs_lgd['lgd_winsorized'].mean()
Out[180]:
0.41378779534574484
In [181]:
#finding predition and no of observation based on grade
obliger=bs_df.groupby(['grade'])['pred'].count().reset_index()
In [182]:
#changing columns names
obliger.columns=['grade','no_of_obligers']
In [183]:
#merging the datasets
caliber=pd.merge(obliger, pd_df[['grade','ttc_pd']], on = 'grade')
In [184]:
#creating lod_odds column
caliber['log_odds']=np.log(caliber['ttc_pd']/(1-caliber['ttc_pd']))
In [185]:
#finding alpha and slope of regression
X=caliber['log_odds'].values.reshape(-1, 1)
Y=caliber['grade'].values.reshape(-1, 1)
regressor = LinearRegression()
regressor.fit(Y,X)
alpha=0.280713563995029
slope=-0.41993576
In [186]:
#creating shifted_log_odds column
caliber['shifted_log_odds']=alpha+slope*caliber['grade']
In [187]:
#creating PD column
caliber['PD'] = np.exp(caliber['shifted_log_odds'])/(1+np.exp(caliber['shifted_log_odds']))
In [188]:
caliber
Out[188]:
grade no_of_obligers ttc_pd log_odds shifted_log_odds PD
0 1 14925 0.479397 -0.082461 -0.139222 0.465251
1 2 19079 0.328770 -0.713754 -0.559158 0.363742
2 3 7823 0.254371 -1.075436 -0.979094 0.273072
3 4 7709 0.216242 -1.287703 -1.399029 0.197970
4 5 7295 0.180534 -1.512731 -1.818965 0.139558
5 6 8985 0.143655 -1.785261 -2.238901 0.096311
6 7 19021 0.080420 -2.436649 -2.658837 0.065446
7 8 2118 0.012156 -4.397730 -3.078773 0.043991
In [189]:
#subsetting of columns
residential_caliber=caliber[['grade','PD']]
In [190]:
#assign rho value
residential_caliber['rho']=.15
In [191]:
#assign lgd value
residential_caliber['lgd']=.41379
In [192]:
#finding dlgd value by putting values
residential_caliber['dlgd']=0.08+0.92*residential_caliber['lgd']
In [193]:
#finding b value by putting values
residential_caliber['b']=(0.11852-0.05478*np.log(residential_caliber['PD']))**2
In [194]:
#assign M value
residential_caliber['M']=5
In [195]:
#finding f(M) value by putting values
residential_caliber['f(M)']=(1+(residential_caliber['M']-2.5)*residential_caliber['b'])/(1-1.5*residential_caliber['b'])
In [196]:
#finding wcdr value by putting values 
residential_caliber['wcdr']=norm.cdf((norm.ppf(residential_caliber['PD'])+np.sqrt(residential_caliber['rho'])*norm.ppf(.999))/np.sqrt(1-residential_caliber['rho']))
In [197]:
 #finding k value by putting values 
residential_caliber['K']=residential_caliber['wcdr']*residential_caliber['dlgd']-residential_caliber['lgd']*residential_caliber['PD']
In [198]:
 residential_caliber
Out[198]:
grade PD rho lgd dlgd b M f(M) wcdr K
0 1 0.465251 0.15 0.41379 0.460687 0.025740 5 1.107094 0.885620 0.215477
1 2 0.363742 0.15 0.41379 0.460687 0.030248 5 1.126743 0.821262 0.227832
2 3 0.273072 0.15 0.41379 0.460687 0.035958 5 1.152032 0.740055 0.227939
3 4 0.197970 0.15 0.41379 0.460687 0.042950 5 1.183630 0.647063 0.216175
4 5 0.139558 0.15 0.41379 0.460687 0.051256 5 1.222098 0.549434 0.195369
5 6 0.096311 0.15 0.41379 0.460687 0.060868 5 1.267935 0.454225 0.169403
6 7 0.065446 0.15 0.41379 0.460687 0.071759 5 1.321660 0.366812 0.141905
7 8 0.043991 0.15 0.41379 0.460687 0.083891 5 1.383869 0.290335 0.115550